Supplement B: Fitting Models

Published

January 23, 2024

1 Overview

Goal: model the spatial distribution of ranch clusters and populations.

Data: Dr. Macfarlan’s ethnographic data, along with ecological and socio-economic variables.

Method: binomial GLM for occupied vs abandoned, negative binomial for population

We want to know what motivates market-integrated, subsistence ranchers to live where they do. This is framed as a trade-off between market integration and ecology.

2 R Preamble

library(MASS)
library(broom)
library(car)
library(here)
library(gt)
library(patchwork)
library(sf)
library(sfdep)
library(terra)
library(tidyterra)
library(tidyverse)
library(viridis)

Specify consistent plot theme here:

Code
theme_set(theme_bw(12))

theme_update(
  panel.grid = element_blank(),
  plot.margin = margin(2,2,2,2),
  strip.background = element_blank(),
  strip.text = element_text(hjust = 0, size = rel(1.1))
)

And some defaults for table formatting:

Code
rename_models <- function(x){
  
  x |> 
    mutate(
      model = case_when(
        model == "occupied"   ~ "Occupied-Abandoned",
        model == "population" ~ "Population",
        model == "secure"     ~ "Population [+ secure]",
        model == "securex"    ~ "Population [x secure]",
        TRUE ~ NA
      )
    )
  
}

format_double <- function(x){
  
  x |> mutate(across(where(is.double), \(.x){ round(.x, digits = 3) }))
  
}

model_table <- function(x){
  
  x |> 
    gt() |> 
    tab_options(
      column_labels.font.weight = "bold",
      data_row.padding = px(6),
      heading.align = "left",
      heading.border.bottom.style = "hidden",
      row.striping.background_color = "gray96",
      row.striping.include_stub = FALSE,
      # row.striping.include_table_body = TRUE,
      table.align = "left",
      table.border.top.style = "hidden",
      table.border.bottom.style = "hidden"
    ) |> 
    sub_missing(missing_text = "")
  
}

3 Data

3.1 Geopackage

Specify path to geopackage database holding all spatial vector data.

gpkg <- here("data", "choyero.gpkg")

3.2 Dependent variables

  • Occupied vs Abandoned ranches
  • Population size of occupied ranches

Rancheros live in “ranch clusters” occupied by close kin.

ranches <- read_sf(gpkg, layer = "ranches")

clusters <- read_sf(gpkg, layer = "clusters")

3.3 Independent variables

  • Market-integration: cost-distance (sort of) to La Paz and Constitucion (hours)
  • Ecology: cost-distance to springs (hours)

Cost-distance to La Paz/Ciudad Constitucion provides a proxy measure of access to markets where rancheros sell livestock and buy food and equipment. Cost-distance to springs represents habitat suitability for small-scale pastoralist economies. Measuring it is a complicated matter, as property rights are involved and multiple springs are used. To avoid some of that complexity, we take the average travel time from a ranch cluster to its two nearest springs. So, this is our basic trade-off between market-integration and local ecology. For further details about how we estimate this variable, see “R/leastcostpaths.html.”

path_to_cities <- read_sf(gpkg, layer = "path_to_cities")

path_to_springs <- read_sf(gpkg, layer = "path_to_springs")

security <- ranches |>   
  st_drop_geometry() |>
  select(cluster, land) |> 
  mutate(secure = ifelse(land == "secure", 1, 0)) |> 
  group_by(cluster) |> 
  summarize(secure = sum(secure)/n())

clusters <- 
  clusters |> 
  mutate(
    springs = path_to_springs$cost,
    cities = path_to_cities$cost,
    paz = path_to_cities$time_to_paz,
    secure = security$secure
  )

remove(path_to_cities, path_to_springs, security, ranches)

3.4 Topography

For additional context, here’s what the terrain looks like.

elevation <- here("data", "dem_30m.tif") |> rast()

elevation_stats <- global(
  elevation, 
  fun = c("min", "max", "mean", "sd"), 
  na.rm = TRUE
)
Code
elevation_stats |> 
  pivot_longer(
    everything(), 
    names_to = "statistic",
    values_to = "value"
  ) |> 
  gt() |> 
  tab_header("Elevation") |> 
  fmt_number(-statistic, decimals = 1) |> 
  cols_width(statistic ~ px(125)) |> 
  tab_options(
    column_labels.hidden = TRUE,
    data_row.padding = px(6),
    heading.align = "left",
    heading.border.bottom.style = "hidden",
    row.striping.background_color = "gray96",
    row.striping.include_stub = FALSE,
    row.striping.include_table_body = TRUE,
    table.align = "left",
    table.border.top.style = "hidden",
    table.border.bottom.style = "hidden"
  )
Elevation
min 0.0
max 979.6
mean 426.9
sd 142.3
Code
hill <- shade(
  terrain(elevation, "slope", unit = "radians"), 
  terrain(elevation, "aspect", unit = "radians"), 
  angle = 30, 
  direction = seq(45, 360, by = 45)
)

hill <- setValues(hill, scales::rescale(values(hill), to = c(1, 1000)))
hill <- round(hill)
  
ggplot() +
  geom_spatraster(
    data = hill, 
    fill = hcl.colors(1000, "Grays")[values(hill)],
    alpha = 0.75,
    maxcell = Inf
  ) +
  geom_spatraster(data = elevation, alpha = 0.5) +
  scale_fill_hypso_c(
    name = "Elevation (m)",
    palette = "utah_1",
    limits = elevation_stats[, c("min", "max")] |> t() |> c(),
    alpha = 0.5,
    na.value = "transparent"
  ) +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.position = "none"
  )

3.5 Data Summary

data_summary <- function(x, .m, .v){
  
  .f <- list(min = min, max = max, mean = mean, sd = sd)
  
  x |> 
    st_drop_geometry() |> 
    mutate(model = .m) |> 
    group_by(model) |> 
    summarize(across({{ .v }}, .fns = .f)) |> 
    pivot_longer(
      -model, 
      names_sep = "_", 
      names_to = c("variable", "statistic")
    ) |> 
    pivot_wider(
      names_from = statistic,
      values_from = value
    )
  
}

variable_summary <- bind_rows(
  clusters |> data_summary("Binomial", c(occupied, springs, paz)),
  clusters |> filter(population > 0) |> data_summary("Poisson", c(population, springs, cities, secure))
)

remove(data_summary)

write_csv(variable_summary, here("data", "variable-summary.csv"))

variable_summary |> 
  group_by(model) |> 
  gt() |> 
  fmt_number(decimals = 2) |> 
  tab_options(
      column_labels.font.weight = "bold",
      data_row.padding = px(6),
      heading.align = "left",
      heading.border.bottom.style = "hidden",
      row_group.as_column = TRUE,
      table.border.top.style = "hidden",
      table.border.bottom.style = "hidden"
    ) |> 
    sub_missing(missing_text = "")
variable min max mean sd
Binomial occupied 0.00 1.00 0.46 0.50
springs 0.00 1.28 0.23 0.27
paz 0.13 3.18 1.11 0.71
Poisson population 1.00 32.00 8.79 8.11
springs 0.00 0.68 0.11 0.18
cities 0.22 2.68 1.08 0.63
secure 0.00 1.00 0.30 0.43

4 Analysis

4.1 Models

In addition to a binomial model of the status of each ranch cluster (occupied vs abandoned), we fit a negative binomial model of the population count of occupied ranch clusters. The negative binomial is an alternative to standard Poisson models where there is an expectation of dispersion in the residuals. The Poisson model assumes that the variance is equal to the mean, but this is not always the case.

models <- list(
  
  occupied = glm(
    occupied ~ paz + springs,
    family = binomial,
    data = clusters
  ),
  population = glm.nb(
    population ~ cities + springs,
    data = clusters |> filter(population > 0) 
  ),
  secure = glm.nb(
    population ~ (cities + springs) + secure,
    data = clusters |> filter(population > 0) 
  ),
  securex = glm.nb(
    population ~ (cities + springs) * secure,
    data = clusters |> filter(population > 0)
  )
  
)

4.2 ANOVA (LRT)

We conduct a Likelihood Ratio Test of the population models to see if adding land security makes a significant improvement to the model.

lrt <- with(models, anova(population, secure, securex, test = "LRT"))

4.3 Residual Autocorrelation

We use Monte Carlo simulations of Moran’s I to test for spatial autocorrelation in the untransformed residuals of each model.

get_moran <- function(x, .sf){
  
  is_nb <- grepl("Negative Binomial", x$family[["family"]])
  
  if (is_nb) .sf <- .sf |> filter(occupied == 1)
  
  geometry <- st_geometry(.sf)
  
  neighbors <- st_knn(geometry, k = nrow(.sf)-1)
  weights <- st_inverse_distance(neighbors, geometry)
  
  moran <- global_moran_perm(residuals(x), neighbors, weights)
  
  moran |> 
    tidy() |> 
    mutate(variable = "residuals", .before = 1)
  
}

morans_i <- models |> 
  map(\(.x){ get_moran(.x, clusters) }) |> 
  bind_rows(.id = "model")

4.4 Variance Inflation

A model of variance inflation tests for multi-collinearity in the predictors.

vif <- models |> 
  map(car::vif) |> 
  bind_rows(.id = "model")

5 Results

5.1 Model evaluation

Code
model_statistics <- models |> 
  map(glance) |> 
  # glance returns logLik column as numeric for glm and 'logLik" for negbin
  map(\(.x){ mutate(.x, across(everything(), as.numeric)) }) |> 
  bind_rows(.id = "model") |> 
  format_double() |> 
  rename_models() |> 
  rename("n.obs" = nobs) |> 
  relocate(model)

model_statistics |> 
  model_table() |> 
  tab_header(title = "Model Statistics")
Model Statistics
model null.deviance df.null logLik AIC BIC deviance df.residual n.obs
Occupied-Abandoned 99.313 71 -30.488 66.975 73.805 60.975 69 72
Population 48.629 32 -97.659 203.319 209.305 32.737 30 33
Population [+ secure] 48.936 32 -97.577 205.153 212.636 32.775 29 33
Population [x secure] 50.399 32 -97.166 208.333 218.808 32.902 27 33
Code
coefficient_estimates <- models |> 
  map(tidy) |> 
  bind_rows(.id = "model") |> 
  format_double() |> 
  rename_models() |> 
  rename("z value" = statistic) |>
  rename_with(gsub, pattern = "\\.", replacement = " ")

coefficient_estimates |> 
  group_by(model) |> 
  model_table() |> 
  tab_options(row_group.as_column = TRUE) |> 
  tab_stubhead("model") |> 
  tab_header(title = "Coefficient Estimates")
Coefficient Estimates
model term estimate std error z value p value
Occupied-Abandoned (Intercept) -2.489 0.836 -2.977 0.003
paz 2.797 0.757 3.695 0.000
springs -4.803 1.828 -2.628 0.009
Population (Intercept) 2.844 0.238 11.939 0.000
cities -0.802 0.214 -3.753 0.000
springs 0.699 0.673 1.039 0.299
Population [+ secure] (Intercept) 2.883 0.265 10.877 0.000
cities -0.804 0.214 -3.759 0.000
springs 0.677 0.674 1.004 0.315
secure -0.114 0.284 -0.403 0.687
Population [x secure] (Intercept) 3.011 0.301 9.999 0.000
cities -0.919 0.256 -3.590 0.000
springs 0.539 0.742 0.726 0.468
secure -0.548 0.540 -1.014 0.310
cities:secure 0.489 0.611 0.800 0.424
springs:secure -0.164 2.013 -0.081 0.935
Code
lrt <- lrt |> 
  as_tibble() |> 
  rename_with(tolower) |> 
  format_double() |> 
  mutate(
    model = case_when(
      model == "cities + springs"            ~ "Population",
      model == "(cities + springs) + secure" ~ "Population [+ secure]",
      model == "(cities + springs) * secure" ~ "Population [x secure]",
      TRUE ~ NA
    )
  )

lrt |> 
  model_table() |> 
  tab_header(
    title = "Likelihood Ratio Test",
    subtitle = "Alternative: difference in log-likelihood greater than zero"
  ) |> 
  cols_width(model ~ pct(21.5))
Likelihood Ratio Test
Alternative: difference in log-likelihood greater than zero
model theta resid. df 2 x log-lik. test df lr stat. pr(chi)
Population 2.977 30 -195.319


Population [+ secure] 3.003 29 -195.153 1 vs 2 1 0.166 0.684
Population [x secure] 3.130 27 -194.333 2 vs 3 2 0.820 0.663
Code
morans_i <- morans_i |> 
  format_double() |> 
  rename_models() |> 
  rename("rank" = parameter)

morans_i |> 
  select(model, statistic, rank, p.value) |> 
  model_table() |> 
  tab_header(
    title = "Monte Carlo Simulations of Moran's I",
    subtitle = "Alternative: spatial autocorrelation (two-sided)"
  )
Monte Carlo Simulations of Moran's I
Alternative: spatial autocorrelation (two-sided)
model statistic rank p.value
Occupied-Abandoned -0.023 200 0.800
Population -0.082 84 0.336
Population [+ secure] -0.085 104 0.416
Population [x secure] -0.080 89 0.356
Code
vif <- vif |> 
  format_double() |> 
  rename_models()

vif |> 
  model_table() |> 
  tab_header(
    title = "Variance Inflation"
  ) |> 
  cols_width(model ~ pct(21.5))
Variance Inflation
model paz springs cities secure cities:secure springs:secure
Occupied-Abandoned 1.357 1.357



Population
1.071 1.071


Population [+ secure]
1.079 1.082 1.029

Population [x secure]
1.334 1.567 3.832 5.976 2.558

High VIF scores for interaction terms can be ignored.

5.2 Marginal Response

Here we calculate the marginal response of each response variable relative to change in each focal variable while holding all non-focal variables at their mean.

# to make the lines smoother when plotting
densify <- function(x, n = 300) seq(min(x), max(x), length = n)

template <- with(
  clusters, 
  tibble(
    paz = densify(paz),
    cities = densify(cities),
    springs = densify(springs)
  )
)

estimate <- function(model, newdata){
  
  est <- predict(
    model, 
    newdata = newdata, 
    se.fit = TRUE
  )
  
  inv_link <- family(model)$linkinv
  
  est <- with(
    est,
    tibble(
      y  = inv_link(fit),
      hi = inv_link(fit + 2*se.fit),
      lo = inv_link(fit - 2*se.fit)
    )
  )
  
  bind_cols(newdata, est)
  
}

responses <- tibble(
    model = c("occupied", "occupied", "population", "population"),
    variable = c("paz", "springs", "cities", "springs")
  ) |> 
  mutate(
    model_object = models[model],
    data = list(template),
    data = map2(data, variable, \(.x, .y){ .x |> mutate(across(-all_of(.y), mean)) }),
    estimate = map2(model_object, data, estimate),
    estimate = map2(estimate, variable, \(.x, .y){ rename(.x, "x" = all_of(.y)) })
  ) |> 
  select(model, variable, estimate) |> 
  unnest(estimate) |> 
  select(model, variable, x, y, hi, lo)
Code
# add old data to display points
observations <- map(models, \(.x){ .x$model })

observations <- tibble(
    model = c("occupied", "occupied", "population", "population"),
    variable = c("paz", "springs", "cities", "springs")
  ) |> 
  mutate(
    old_data = observations[model],
    old_data = map2(old_data, variable, \(.x, .y){ rename(.x, "x" = all_of(.y)) }),
    old_data = map2(old_data, model,    \(.x, .y){ rename(.x, "y" = all_of(.y)) })
  ) |> 
  unnest(old_data) |> 
  select(model, variable, x, y)

baja_labels <- c(
    "paz" = "To La Paz",
    "springs" = "To Springs",
    "cities" = "To Cities"
)

gg_occupied <- ggplot() +
  geom_ribbon(
    data = responses |> filter(model == "occupied"),
    aes(x, ymin = lo, ymax = hi),
    fill = "#F5F5F5",
    color = "transparent"
  ) +
  geom_line(
    data = responses |> filter(model == "occupied"),
    aes(x, y),
    linewidth = 0.7
  ) +
  geom_point(
    data = observations |> filter(model == "occupied"),
    aes(x, y),
    size = 2.5,
    color = alpha("#2E4756", 0.7)
  ) +
  labs(
    x = "Hours",
    y = "Abandoned-Occupied"
  ) +
  facet_wrap(
    vars(variable), 
    scales = "free_x",
    labeller = labeller(variable = baja_labels)
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank()
  )

gg_population <- ggplot() +
  geom_ribbon(
    data = responses |> filter(model == "population"),
    aes(x, ymin = lo, ymax = hi),
    fill = "#F5F5F5",
    color = "transparent"
  ) +
  geom_line(
    data = responses |> filter(model == "population"),
    aes(x, y),
    linewidth = 0.7
  ) +
  geom_point(
    data = observations |> filter(model == "population"),
    aes(x, y),
    size = 2.5,
    color = alpha("#2E4756", 0.7)
  ) +
  labs(
    x = "Hours",
    y = "Population"
  ) +
  facet_wrap(
    vars(variable),
    scales = "free_x",
    labeller = labeller(variable = baja_labels)
  )

gg <- gg_occupied/gg_population

ggsave(
  plot = gg,
  here("figures", "response-plots.png"),
  dpi = 600,
  width = 5.75,
  height = 5.75
)

6 Session Info

Code
# save the session info as an object
pkg_sesh <- sessioninfo::session_info(pkgs = "attached")

# inject the quarto info
pkg_sesh$platform$quarto <- paste(
  quarto::quarto_version(), 
  "@", 
  quarto::quarto_path()
  )

# print it out
pkg_sesh
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.1 (2023-06-16 ucrt)
 os       Windows 11 x64 (build 22621)
 system   x86_64, mingw32
 ui       RTerm
 language (EN)
 collate  English_United States.utf8
 ctype    English_United States.utf8
 tz       America/Denver
 date     2024-01-23
 pandoc   3.1.1 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
 quarto   1.4.545 @ C:\\PROGRA~1\\Quarto\\bin\\quarto.exe

─ Packages ───────────────────────────────────────────────────────────────────
 package     * version date (UTC) lib source
 broom       * 1.0.5   2023-06-09 [1] CRAN (R 4.3.1)
 car         * 3.1-2   2023-03-30 [1] CRAN (R 4.3.1)
 carData     * 3.0-5   2022-01-06 [1] CRAN (R 4.3.1)
 dplyr       * 1.1.4   2023-11-17 [1] CRAN (R 4.3.1)
 forcats     * 1.0.0   2023-01-29 [1] CRAN (R 4.3.1)
 ggplot2     * 3.4.4   2023-10-12 [1] CRAN (R 4.3.2)
 gt          * 0.10.1  2024-01-17 [1] CRAN (R 4.3.2)
 here        * 1.0.1   2020-12-13 [1] CRAN (R 4.3.1)
 lubridate   * 1.9.3   2023-09-27 [1] CRAN (R 4.3.2)
 MASS        * 7.3-60  2023-05-04 [2] CRAN (R 4.3.1)
 patchwork   * 1.2.0   2024-01-08 [1] CRAN (R 4.3.2)
 purrr       * 1.0.2   2023-08-10 [1] CRAN (R 4.3.1)
 readr       * 2.1.5   2024-01-10 [1] CRAN (R 4.3.2)
 sf          * 1.0-15  2023-12-18 [1] CRAN (R 4.3.2)
 sfdep       * 0.2.3   2023-01-11 [1] CRAN (R 4.3.1)
 stringr     * 1.5.1   2023-11-14 [1] CRAN (R 4.3.1)
 terra       * 1.7-65  2023-12-15 [1] CRAN (R 4.3.1)
 tibble      * 3.2.1   2023-03-20 [1] CRAN (R 4.3.1)
 tidyr       * 1.3.0   2023-01-24 [1] CRAN (R 4.3.1)
 tidyterra   * 0.5.2   2024-01-19 [1] CRAN (R 4.3.1)
 tidyverse   * 2.0.0   2023-02-22 [1] CRAN (R 4.3.1)
 viridis     * 0.6.4   2023-07-22 [1] CRAN (R 4.3.1)
 viridisLite * 0.4.2   2023-05-02 [1] CRAN (R 4.3.1)

 [1] C:/Users/kenne/AppData/Local/R/win-library/4.3
 [2] C:/Program Files/R/R-4.3.1/library

──────────────────────────────────────────────────────────────────────────────